library(leaflet)
library(tidyverse)
library(RSocrata)
library(sf)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date

In this document we get the parks data from the New York City open data repository and do some cleaning. First, we read the parks dataset. The original data dictionary is available on GitHub.

parks_url <- "https://data.cityofnewyork.us/api/geospatial/y6ja-fw4f?method=export&format=GeoJSON"
parks <- st_read(parks_url, stringsAsFactors = FALSE) %>%
  mutate(shape_area = as.numeric(shape_area))
## Reading layer `OGRGeoJSON' from data source `https://data.cityofnewyork.us/api/geospatial/y6ja-fw4f?method=export&format=GeoJSON' using driver `GeoJSON'
## Simple feature collection with 12491 features and 10 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -74.25537 ymin: 40.49613 xmax: -73.70182 ymax: 40.91138
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
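
The data dictionary lives elsewhere, so a quick glimpse (from dplyr; it works on sf objects too) is a handy stand-in for the fields we rely on below: park_name, landuse, parknum, source_id, and shape_area.

glimpse(parks)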

The New York City open data portal also contains a list of designated trails within parks.

trails <- read_csv("https://data.cityofnewyork.us/api/views/vjbm-hsyr/rows.csv?accessType=DOWNLOAD")
## Parsed with column specification:
## cols(
##   Park_Name = col_character(),
##   Width_ft = col_character(),
##   Class = col_character(),
##   Surface = col_character(),
##   Gen_Topog = col_character(),
##   Difficulty = col_double(),
##   Date_Collected = col_character(),
##   Trail_Name = col_character(),
##   ParkID = col_character(),
##   TrailMarkersInstalled = col_character(),
##   SHAPE = col_character()
## )
trails2 <- jsonlite::fromJSON("https://www.nycgovparks.org/bigapps/DPR_Hiking_001.json")
# park identifiers from both trail sources, matched against parknum below
parks_with_trails <- c(trails$ParkID, trails2$Prop_ID)
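
As a quick sanity check (a sketch; it assumes both sources report park numbers in the same format), we can count how many distinct parks have designated trails:

n_distinct(parks_with_trails)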

Cleaning

The map of Flushing Meadows in Queens below shows some of the difficulties of this dataset:

- First, the park is split into multiple separate outlines, so no single polygon reflects its total size. This makes Flushing Meadows appear smaller than Central Park, which is a single large polygon.
- Second, the dataset includes tennis courts and baseball fields as separate facilities. We want to treat these as attributes of their containing park rather than as independent facilities.
- Third, parking lots for stadia (in this case Citi Field) are included as open park space. We should remove these, as they are categorically unlike other green spaces.
- Fourth, important green spaces like cemeteries are in a different dataset.
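
Before mapping, a quick tabulation (a sketch using the landuse field) shows how many separate features make up Flushing Meadows:

parks %>%
  st_drop_geometry() %>%
  filter(grepl("Flushing Meadows", park_name)) %>%
  count(landuse)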

# a near-zero buffer is a common trick to repair invalid polygon geometries
leaflet(st_buffer(parks %>% filter(grepl("Flushing Meadows", park_name)), dist = 0.000001)) %>%
  addProviderTiles(providers$Esri.WorldGrayCanvas) %>%
  addPolygons(label = ~as.character(park_name), weight = 0.5)
## Warning in st_buffer.sfc(st_geometry(x), dist, nQuadSegs, endCapStyle =
## endCapStyle, : st_buffer does not correctly buffer longitude/latitude data
## dist is assumed to be in decimal degrees (arc_degrees).

First, we manually remove the park facilities corresponding to stadia and their associated parking lots, and then tag facilities that are not explicitly park boundaries.

notparks <- c(9498000833, 14498000872, 14498000794, # Citi Field, Yankee Stadium, garage
              9498000631)  # USTA Billie Jean King National Tennis Center

parks <- parks %>% 
  filter(!(source_id %in% notparks)) %>%
  mutate(
    # grepl() and %in% already return logicals, so no ifelse() is needed
    courts = grepl("Courts", landuse),
    playgrounds = grepl("Playground", landuse),
    trails = parknum %in% parks_with_trails
  ) 

With those areas removed, we dissolve interior boundaries based on the park number: grouping by parknum and unioning the geometries makes parks that are effectively the same facility a single polygon feature. Tennis courts and the like become boolean variables indicating whether the park has those facilities. We also drop pieces smaller than half an acre (keeping courts regardless of size) and re-calculate each park's area in acres.

dissolved <- parks %>%
  filter(!is.na(parknum)) %>%
  group_by(parknum) %>%
  # drop pieces under half an acre (43,560 sq ft per acre) unless they are courts
  filter(shape_area > 0.5 * 43560 | courts) %>%
  summarise(
    area = sum(shape_area), 
    courts = any(courts), 
    playgrounds = any(playgrounds),
    trails = any(trails),
    park_name = park_name[1]
  ) %>%
  # st_area() returns a units object, so set_units() converts to acres correctly
  # whatever the underlying unit of measurement is
  mutate(sqft = st_area(.), acres = units::set_units(sqft, acres)) %>%
  st_transform(3628)
leaflet(st_transform(dissolved, 4326)) %>%
  addProviderTiles(providers$Esri.WorldGrayCanvas) %>%
  addPolygons(weight = 0.5)

Cemeteries

Cemeteries are important green spaces that can be used for physical activity but are not parks. They are stored in a separate dataset, Open Space (Other), with the feature code 2500.

cemeteries_url <- "https://data.cityofnewyork.us/api/geospatial/pckb-8r2z?method=export&format=GeoJSON"
cemeteries <- st_read(cemeteries_url) %>%
  filter(feature_co == 2500)
## Reading layer `OGRGeoJSON' from data source `https://data.cityofnewyork.us/api/geospatial/pckb-8r2z?method=export&format=GeoJSON' using driver `GeoJSON'
## Simple feature collection with 4191 features and 7 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -74.24461 ymin: 40.5003 xmax: -73.70169 ymax: 40.91425
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
dissolved_cemeteries <- cemeteries %>%
  st_transform(3628) %>%
  # small 5-unit buffer in the projected CRS to clean up the polygon edges
  st_buffer(5) %>%
  mutate(sqft = st_area(.), acres = units::set_units(sqft, acres))
leaflet(st_transform(dissolved_cemeteries, 4326)) %>%
  addProviderTiles(providers$Esri.WorldGrayCanvas) %>%
  addPolygons(weight = 0.5)

Now we can join the cemeteries data to the parks data to get a single polygon layer.

open_spaces <- rbind(
  dissolved %>% 
    transmute(id = parknum, name = park_name, acres, courts, playgrounds, trails, 
              type = "park"),
  dissolved_cemeteries %>% 
    transmute(id = source_id, name, acres, courts = FALSE, playgrounds = FALSE, 
              trails = FALSE, type = "cemetery")
  ) %>%
  filter(as.numeric(acres) > 0.5) %>%
  st_simplify(preserveTopology = TRUE)
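
As a quick check on the combined layer (a sketch), counts by type confirm that both sources made it in:

open_spaces %>%
  st_drop_geometry() %>%
  count(type)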

Tweet Counts

We obtained geolocated tweets sent from within the boundaries of each park from July through October 2014.

tweets <- read_csv("data/tweet_counts.csv") %>%
  filter(id %in% open_spaces$id) %>%
  # columns are named like PK_7_1 ... PK_10_31: one column per day
  gather(date, count, PK_7_1:PK_10_31) %>%
  separate(date, c("pk", "month", "day"), sep = "_") %>%
  mutate(date = as_date(str_c("2014", month, day, sep = "-"))) %>%
  dplyr::select(id, date, count) %>%
  mutate(
    month = month(date, label = TRUE),
    wday = wday(date, label = TRUE)
  ) 
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   id = col_character(),
##   name = col_character(),
##   type = col_character()
## )
## See spec(...) for full column specifications.

We want to know how consistent the Twitter data are across months and days of the week. Somewhat surprisingly, Twitter activity is lowest on Saturday, followed by Sunday.

tweets %>%
  ggplot(aes(x = wday)) +
  geom_bar()

August is very low! Let’s make sure to use September or October.

tweets %>%
  ggplot(aes(x = month)) +
  geom_bar()

Let’s look at the within-park consistency or variability of Twitter activity. It appears that there is some day-to-day variation, but parks with high use on one day typically have high use on others.

tweets %>%
  filter(month == "Sep") %>%
  ggplot(aes(x = date, y = count + 1, group = id)) +
  geom_line(alpha = 0.5) + scale_y_log10()
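
To put a number on that consistency (a sketch; the +1 offset, which avoids dividing by zero for parks with no tweets, is an arbitrary choice), we can compute a per-park coefficient of variation of the September daily counts:

tweets %>%
  filter(month == "Sep") %>%
  group_by(id) %>%
  summarise(cv = sd(count) / mean(count + 1)) %>%
  arrange(desc(cv))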

Let’s calculate each park’s average daily tweet count (weekdays only) as well as its total tweets in September, and use those going forward.

(tweets_summary <- tweets %>%
  filter(month == "Sep") %>%
  filter(!wday %in% c("Sat", "Sun")) %>%
  group_by(id) %>%
  summarise(
    tweet_average = mean(count),
    tweet_count = sum(count)
  ) %>%
  arrange(-tweet_count))
## # A tibble: 1,276 x 3
##    id    tweet_average tweet_count
##    <chr>         <dbl>       <dbl>
##  1 M010           72.4        1593
##  2 M231           35.7         785
##  3 X030           35.5         780
##  4 M232           19.4         427
##  5 M093           16.9         372
##  6 M008           14.1         310
##  7 M098           13.5         298
##  8 Q099           12.2         269
##  9 M013           10.6         233
## 10 M089           10.6         233
## # … with 1,266 more rows
ggplot(tweets_summary, aes(x = tweet_count)) +
  geom_histogram() 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

open_spaces <- open_spaces %>%
  left_join(tweets_summary, by = "id") %>%
  # parks with no matching tweets get NA from the join; treat those as zero
  mutate_at(
    vars(starts_with("tweet")), 
    list(~replace_na(., 0))
  ) 
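
A final sanity check (a sketch) that no missing values survived the join:

sum(is.na(open_spaces$tweet_count))  # should be 0 after replace_na()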

Now we write out the data for future use.

write_rds(open_spaces, "data/open_spaces.rds")
geojsonio::geojson_write(open_spaces, file = "data/open_spaces.geojson")
## Registered S3 method overwritten by 'geojsonio':
##   method         from 
##   print.location dplyr
## Success! File is at data/open_spaces.geojson
## <geojson-file>
##   Path:       data/open_spaces.geojson
##   From class: geo_list
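
To pick the layer back up in a later document, either file works; a minimal sketch:

open_spaces <- readr::read_rds("data/open_spaces.rds")
# or, from the GeoJSON copy:
open_spaces <- sf::st_read("data/open_spaces.geojson")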